## prepare release data as files by dam

prep_selected_release_files <- function(daily_release_selected){


  tar_read(daily_release_selected) |> 
    split(~RHPID) |> 
    map(function(dam_release_table){

      dam_release_table[["RHPID"]][1] -> RHPID
      
      write_parquet(
        dam_release_table,
        paste0("data/internals/daily_release_selected_by_dam/",
               gsub("/", "-", RHPID), ".parquet")
      )

    })
  
  
  return("data/internals/daily_release_selected_by_dam/")
  
}

aggregate_EIA_gen_to_RHPID <- function(target_plants_mapped_to_water,
                                       gen_1980_2022_monthly_with_freq,
                                       QFER_gen){

  expand.grid(year = reanalysis_years,
              RHPID = unique(target_plants_mapped_to_water[["RHPID"]]),
              month = month(1:12, label = T)) |>
      as_tibble() |> mutate(na_ = NA_real_, RHPID = as.character(RHPID)) |>
      pivot_wider(names_from = month, values_from = na_) |>
      mutate(source = NA_character_) ->
      obs_grid_blank

  gen_1980_2022_monthly_with_freq |> 
    left_join(target_plants_mapped_to_water |> 
                select(EIA_ID, RHPID),
              by = join_by(EIA_ID)) |>
    mutate(freq = factor(freq, levels = c("No data",
                                          "Annual survey",
                                          "Monthly survey"),
                         ordered = T)) |> 
    arrange(year, RHPID, freq) |>
    summarise(
      freq = first(freq),
      Jan = sum(Jan), Feb = sum(Feb), Mar = sum(Mar),
      Apr = sum(Apr), May = sum(May), Jun = sum(Jun),
      Jul = sum(Jul), Aug = sum(Aug), Sep = sum(Sep),
      Oct = sum(Oct), Nov = sum(Nov), Dec = sum(Dec),
      .by = c(year, RHPID)
      ) -> EIA_monthly_gen_all_freq
    
  # grab only actual monthly observations for each site!
  EIA_monthly_gen_all_freq |> 
    filter(freq == "Monthly survey",
    year %in% reanalysis_years) |> 
    select(-freq) |> 
    arrange(RHPID, year) |> 
    # remove rows with any NAs
    filter(!if_any(everything(), is.na)) |>
    mutate(source = "EIA-923_M") ->
    EIA_monthly_gen_obs_only

  # introduce the CEC data
  QFER_gen |>
    select(EIAPlantID, Year, Month, Unit, NetMWh) |>
    mutate(EIA_ID = as.integer(EIAPlantID),
           year = as.integer(Year),
           month = month(Month, label = T)) |>
    left_join(target_plants_mapped_to_water |> 
                select(RHPID, EIA_ID)) |>
    # remove cases not included in RHP (retired, < 10MW, etc)
    filter(!is.na(RHPID), year %in% reanalysis_years) |> 
    summarise(NetMWh = sum(NetMWh), .by = c(RHPID, month, year)) |>
    pivot_wider(names_from = month, values_from = NetMWh) |>
    mutate(source = "CEC-QFER") ->
    QFER_gen_RHP_wide

  obs_grid_blank |>
    rows_update(EIA_monthly_gen_obs_only, by = c("RHPID", "year")) |>
    rows_update(QFER_gen_RHP_wide, by = c("RHPID", "year")) ->
    all_monthly_gen_obs_only

  return(all_monthly_gen_obs_only)
  
}


get_spill_quantile <- function(gen_1980_2019_monthly_obs_by_RHPID,
                               daily_release_selected_by_dam){
  
  gen_1980_2019_monthly_obs_by_RHPID |>
    select(-source) |>
    split(~RHPID) |> 
    map_dfr(function(obs_release_table){

      
      obs_release_table[["RHPID"]][1] -> RHPID
      
      message("Computing spill threshold dam ", RHPID)
      
      obs_release_table |> 
        pivot_longer(-c(year, RHPID), names_to = "month", values_to = "gen") |>
        mutate(month = factor(month, levels = month.abb, ordered = T)) |> 
        mutate(gen = if_else(gen < 0, 0, gen)) |> 
        mutate(allocation = gen / sum(gen), .by = c(year)) |> 
        filter(!is.na(allocation)) ->
        observed_allocations
      
      if(nrow(observed_allocations) == 0){
        return(
          tibble(RHPID = !!RHPID,
                 spill_quantile = NA_real_,
                 RMSE = NA_real_)
        )
      }

      read_parquet(
        paste0(daily_release_selected_by_dam,
               gsub("/", "-", RHPID), ".parquet")
        ) |>
        mutate(year = year(date), month = month(date, label = T)) |> 
        select(year, month, date, flow_cumecs, data_from) |> 
        filter(year %in% unique(observed_allocations[["year"]])) |> 
        # REMOVE LINE BELOW AFTER 3042 FLOWS ARE CORRECTED!
        mutate(flow_cumecs = if_else(RHPID == "3402_FORT LOUDOUN" & flow_cumecs == 0,
                       NA_real_, flow_cumecs)) ->
        release_pre_partition
      
      # flow_priority
      
      # stick to one inflow data source
      (release_pre_partition |> 
        count(data_from) |> 
        left_join(flow_priority, by = join_by(data_from)) |> 
        arrange(rank) |> pull(data_from))[1] ->
        flow_source_to_use

      release_pre_partition |> 
        filter(data_from == flow_source_to_use) |> 
        summarise(flow = mean(flow_cumecs, na.rm = T),
               .by = c(year, month)) |> 
        mutate(flow_allocation = flow / sum(flow), .by = c(year)) |> 
        left_join(observed_allocations, by = join_by(year, month)) ->
        monthly_flow_and_generation 
      
      monthly_flow_and_generation |> split(~year) |> 
        map_dfr(function(flow_gen_yr){
          
          suppressWarnings(
            tibble(
              year = flow_gen_yr[["year"]][1],
              sp_cor = cor(flow_gen_yr$flow, flow_gen_yr$gen,
                           method = "spearman")
            )
          )


        }) |> 
        filter(sp_cor > 0.7) |> pull(year) -> years_to_analyze
      
      if(length(years_to_analyze) == 0){
        return(
          tibble(RHPID = !!RHPID,
                 spill_quantile = NA_real_,
                 RMSE = NA_real_)
        )
      }
      

      if(all(is.na(release_pre_partition[["flow_cumecs"]]))){
        return(
          tibble(RHPID = !!RHPID,
                 spill_quantile = NA_real_,
                 RMSE = NA_real_)
        )
      }
            
      if(all(release_pre_partition[["flow_cumecs"]] == 0)){
        return(
          tibble(RHPID = !!RHPID,
                 spill_quantile = NA_real_,
                 RMSE = NA_real_)
        )
      }
      

      compute_allocation_RMSE <- function(spill_quantile = 0.9){
        
        quantile(release_pre_partition[["flow_cumecs"]],
                 spill_quantile, na.rm = T)[[1]] ->
          threshold
        
        release_pre_partition |> 
          mutate(penstock_release = if_else(flow_cumecs > threshold,
                                            threshold, flow_cumecs)) |> 
          summarise(mean_flow = mean(penstock_release, na.rm = T),
                    .by = c(year, month)) |> 
          mutate(flow_allocation = mean_flow / sum(mean_flow),
                 .by = year) |> 
          left_join(observed_allocations,
                    by = join_by(year, month)) |> 
          filter(year %in% years_to_analyze) -> allocations
        
        error <- allocations[["allocation"]] - allocations[["flow_allocation"]]
        rmse <- sqrt(mean(error ^ 2, na.rm = T))
        
        return(rmse)

      }
      
      ecdf_flow <- ecdf(release_pre_partition[["flow_cumecs"]])
      
      set_lower_limit <- ecdf_flow(0)
      
      initial <- (set_lower_limit + 1) / 2

      
      optim(par = initial, fn = compute_allocation_RMSE,
            lower = max(0.01, set_lower_limit), upper = 0.999,
            method = "Brent") -> optim_result
      
      
      tibble(RHPID = !!RHPID,
             spill_quantile = optim_result[["par"]],
             RMSE = optim_result[["value"]])
      
    }) -> all_spill_thresholds_computed

}


get_daily_penstock_release <- function(spill_quantiles,
                                       daily_release_selected_by_dam){

  spill_quantiles |> 
    mutate(spill_quantile = if_else(
      is.na(spill_quantile),
      mean(spill_quantile, na.rm = T),
      spill_quantile)
    ) |> select(RHPID, spill_quantile) |> 
    pmap(function(RHPID, spill_quantile){
      
      read_parquet(
        paste0(daily_release_selected_by_dam,
               gsub("/", "-", RHPID), ".parquet")) ->
        daily_flow_table
      
      daily_flow_table[["RHPID"]] |> unique() -> RHPID
      gsub("/", "-", RHPID) -> RHPID_fn
      
      daily_flow_table |> 
        mutate(cutoff = quantile(flow_cumecs,
                                    spill_quantile, na.rm = T),
                  .by = data_from) |> 
        mutate(
          penstock = if_else(
            flow_cumecs > cutoff, cutoff, flow_cumecs),
            spill = flow_cumecs - penstock
          ) |> 
        select(date, total_from = data_from,
               total = flow_cumecs,
               penstock, spill) |> 
        write_parquet(paste0(
          "data/internals/daily_release_selected_and_partitioned/",
          RHPID_fn, ".parquet"
        ))
      })
  

  return("data/internals/daily_release_selected_and_partitioned/")
  
}
